This code is assembled to perform the steps to summarize differentially expressed genes and clusters of cells. The processed data were downloaded and saved in SEUSS_processed_data.
This cade has been pulled from https://github.com/yanwu2014/SEUSS-Analysis and specifically copied from hESC_summarize_results.R
The data for cells cultured in the pluripotent stem cell medium has been put into a compressed file in the Google Drive. You should be able to download it from the following link: https://drive.google.com/open?id=19qHsRFO4QwHhotxuw73MYRHB8CFYPM3c and extract the files onto your local computer. You will also need to change the file.handle appropriately (see below).
The following packages must be installed prior to running (many needed only to load swne package. Uncomment to execute.
getPackageIfNeeded <- function(pkg) {
if (!require(pkg, character.only=TRUE, quietly =TRUE))
install.packages(pkgs=pkg, dependencies=TRUE)
}
getBiocPackageIfNeeded <- function(pkg, vers = NULL) {
if (!require(pkg, character.only=TRUE, quietly =TRUE))
BiocManager::install(pkgs=pkg, version=vers)
}
bcpkgs <- list(list(pkg="BiocManager",vers="3.10"),
list(pkg="TxDb.Hsapiens.UCSC.hg38.knownGene",vers="3.10"),
list(pkg="org.Hs.eg.db"),
list(pkg="GO.db"),
list(pkg="qvalue", vers="3.10")
)
invisible(lapply(bcpkgs, function(x)
do.call(getBiocPackageIfNeeded, args=x)))
pkgs <- c("devtools","uwot","umap","Seurat")
invisible(sapply(pkgs,getPackageIfNeeded))
if (!require("NNLM"))
devtools::install_version("NNLM", version = "0.4.3")
if (!require("swne"))
devtools::install_github("yanwu2014/swne")
if (!require("perturbLM"))
devtools::install_github("yanwu2014/perturbLM", upgrade = "never")
Set the file.handle parameter to the name of the sample to be analyzed (this can include path information if data not in the same directory as this file). Generate a summary table for each sample, as well as cluster enrichment heatmaps, tSNE plots, and differential expression barplots.
The hESC_run_regression.R and hESC_run_clustering.R must have been run on the sample first. (This has already been done by Parekh et al.)
Load all required libraries.
#### Create t-SNE plots, differential expression heatmaps, and cluster enrichment heatmaps
library(perturbLM)
library(swne)
library(ggplot2)
library(RColorBrewer)
library(Seurat)
library(grid)
library(gtable)
library(umap)
Define file locations and load data. Example below includes path information for my computer. Must be modified to work appropriately on your computer.
## Name of the sample to be analyzed
file.handle <- "/Users/vagan/Downloads/Parekh_paper_data/up-tf-stem"
#file.handle <- "up-tf-stem"
# file.handle <- "up-tf-endo"
# file.handle <- "up-tf-multi"
# file.handle <- "up-tf-klf"
# file.handle <- "up-tf-myc"
# file.handle <- "up-tf-neuron-nohygro"
## Load regression results
coefs.df <- read.table(paste(file.handle, "regression.pvals.tsv", sep = "_"), sep = "\t",
header = T, stringsAsFactors = F)
top.coefs.df <- subset(coefs.df, FDR < 0.05 & abs(cf) > 0.025)
print(sort(table(top.coefs.df$Group), decreasing = T))
KLF4 CDX2 SNAI2 ONECUT1 MYOG ASCL1 NEUROD1
487 272 226 132 122 121 112
MYOD1 TBX5 MYC HNF1A RUNX1 NEUROG3 ASCL4
101 55 50 47 45 44 39
MESP1 NEUROG1 LHX3 ASCL3 OTX2 SPI1 TFAP2C
36 35 33 31 30 28 27
SOX2 PAX7 FOXA3 ASCL5 CRX HAND2 HOXA1
23 22 20 19 18 18 18
SPIC SOX3 FOXA2 SOX10 HOXA11 POU1F1 GATA4
17 16 15 15 14 13 12
SIX2 ETV2 ESRRG GATA2 HOXB6 SPIB GATA1
11 9 6 6 5 4 3
MEF2C MITF SIX1 ERG HOXA10 LMX1A mCherry-int-ctrl
3 3 3 2 2 2 2
MYCL SRY ATF7 FOXP1 NRL
2 2 1 1 1
## Load clustering
r <- readRDS(paste(file.handle, "clustering.Robj", sep = "_"))
clusters <- r@ident; names(clusters) <- r@cell.names;
levels(clusters) <- paste("C", levels(clusters), sep = "")
clusters.list <- UnflattenGroups(clusters)
# update v2 Seurat object to v3
r <- UpdateSeuratObject(r)
Updating from v2.X to v3.X
Validating object structure
Updating object slots
Ensuring keys are in the proper strucutre
Ensuring feature names don't have underscores or pipes
Object representation is consistent with the most current Seurat version
## Load genotypes
# genotypes.list <- ReadGenotypes("up-tf-klf-myc_pheno_dict.csv")
genotypes.list <- ReadGenotypes(paste(file.handle, "pheno_dict.csv", sep = "_"))
genotypes.list <- lapply(genotypes.list, function(x) x[x %in% names(clusters)])
genotypes.sizes <- sapply(genotypes.list, length)
Calculate enrichment scores and summarize results.
## Calculate genotype enrichment
genotypes.cl.tbl <- GenotypeClusterCounts(genotypes.list, clusters.list)
genotypes.cl.pvals <- GenotypeClusterPvals(genotypes.cl.tbl)
genotypes.min.p <- apply(genotypes.cl.pvals, 1, function(x) min(abs(x)) * length(x))
metap::sumlog(genotypes.min.p)
Some studies omitted
chisq = 3750.279 with df = 118 p = 0
## Summarize analysis results
genotype.counts <- sort(table(top.coefs.df$Group), decreasing = T)
genotype.counts <- genotype.counts[names(genotype.counts) %in% rownames(genotypes.cl.pvals)]
genotype.avg_cf <- tapply(top.coefs.df$cf, top.coefs.df$Group, function(x) mean(abs(x)))[names(genotype.counts)]
genotype.summary <- data.frame(n_diff_genes = as.integer(genotype.counts), avg_cf = genotype.avg_cf)
genotype.summary$min_cluster_pval <- apply(genotypes.cl.pvals[rownames(genotype.summary), ], 1, function(x) min(abs(x)))
genotype.summary$min_cluster <- apply(genotypes.cl.pvals[rownames(genotype.summary), ], 1, function(x) names(x[which.min(abs(x))]))
genotype.summary$n_cells <- genotypes.sizes[rownames(genotype.summary)]
Uncomment code below if you want to save the output data to a file.
# write.table(genotype.summary, file = paste(file.handle, "summary.tsv", sep = "_"), sep = "\t")
Continue processing to identify significant genotypes.
## Determine significant genotypes
min.cl.pval <- 1e-12
min.diff.genes <- 40
ctrl.cl <- genotype.summary["mCherry", "min_cluster"]
sig.genotypes <- rownames(subset(genotype.summary, (abs(min_cluster_pval) < min.cl.pval & min_cluster != ctrl.cl)
| n_diff_genes > min.diff.genes))
sig.genotypes <- sort(unique(c(sig.genotypes, "mCherry-int-ctrl")), decreasing = T); print(sig.genotypes);
[1] "TBX5" "SNAI2" "RUNX1" "ONECUT1" "NEUROG3" "NEUROG1"
[7] "NEUROD1" "MYOG" "MYOD1" "MYC" "mCherry-int-ctrl" "KLF4"
[13] "HNF1A" "CDX2" "ASCL4" "ASCL1"
Generate barplot graph.
## Differential expression barplot
n.diff.genes <- genotype.summary[sig.genotypes, "n_diff_genes"]
names(n.diff.genes) <- sig.genotypes
gg.bar <- ggBarplot(n.diff.genes, fill.color = "purple")
# pdf(paste(file.handle, "diff_genes_barplot.pdf", sep = "_"), width = 3.5, height = 1.5)
# print(gg.bar)
# dev.off()
## Cluster enrichment heatmap (Figure 1c-e)
max.lp <- 50
genotypes.cl.lp <- apply(genotypes.cl.pvals[sig.genotypes,], 1:2, function(x) ifelse(x > 0, -log10(x), log10(-1 * x)))
genotypes.cl.lp[genotypes.cl.lp > max.lp] <- max.lp
genotypes.cl.lp[genotypes.cl.lp < -1 * max.lp] <- -1 * max.lp
# pdf(paste(file.handle, "cl_enrich_heatmap.pdf", sep = "_"), width = 4.5, height = 5)
# ggHeat(genotypes.cl.lp, clustering = "none", x.lab.size = 14, y.lab.size = 14)
# dev.off()
gg.heat <- ggHeat(t(genotypes.cl.lp), clustering = "none", x.lab.size = 14, y.lab.size = 14)
gg.bar <- gg.bar + theme(axis.text.x = element_blank())
gg.heat <- gg.heat + theme(legend.position = "none")
gg.1 <- ggplotGrob(gg.heat)
gg.2 <- ggplotGrob(gg.bar)
gg.aligned <- rbind(gg.2, gg.1, size = "first")
gg.aligned$widths <- grid::unit.pmax(gg.1$widths, gg.2$widths)
#pdf(paste(file.handle, "stacked_diff_genes_cl_enrich_nolabels.pdf", sep = "_"), width = 5.5, height = 6)
grid::grid.newpage()
grid::grid.draw(gg.aligned)
#dev.off()
Generate tSNE plots
## tSNE plots (Figure 1c-e)
plot.seed <- 32907098
tsne.emb <- Embeddings(object = r, reduction = "tsne")
#pdf(paste(file.handle, "tsne_plot.pdf", sep = "_"), width = 4, height = 4)
PlotDims(tsne.emb, sample.groups = clusters, pt.size = 0.75, alpha.plot = 0.5, label.size = 6, do.label = T,
show.legend = F, seed = plot.seed)
#dev.off()
#pdf(paste(file.handle, "tsne_plot_nolabel.pdf", sep = "_"), width = 4, height = 4)
PlotDims(tsne.emb, sample.groups = clusters, pt.size = 0.75, alpha.plot = 0.5, label.size = 0, do.label = F,
show.legend = F, seed = plot.seed)
#dev.off()
#pdf(paste(file.handle, "tsne_plot_batch.pdf", sep = "_"), width = 4, height = 4)
batch <- factor(r@meta.data$batch)
# names(batch) <- r@cell.names
names(batch) <- colnames(r)
batch <- batch[rownames(tsne.emb)]
PlotDims(tsne.emb, sample.groups = batch, pt.size = 1, alpha.plot = 0.5, label.size = 8, do.label = T,
show.legend = F, seed = plot.seed)
#dev.off()
## tSNE plot overlay
# plot.seed <- 32907098
# tsne.emb <- Embeddings(object = r, reduction = "tsne")
TF <- "NEUROD1"
tf.groups <- as.character(clusters); names(tf.groups) <- names(clusters);
tf.groups[names(tf.groups) %in% genotypes.list[[TF]]] <- TF
tf.groups[!names(tf.groups) %in% genotypes.list[[TF]]] <- ""
tf.groups <- factor(tf.groups)
#pdf(paste0(file.handle, "_tsne_plot_", TF, ".pdf"), width = 3, height = 3)
PlotDims(tsne.emb, sample.groups = tf.groups, pt.size = 0.35, alpha.plot = 0.4, label.size = 0, do.label = T,
show.legend = F, seed = plot.seed) + scale_color_manual(values = c("grey", "red")) + ggtitle(TF)
#dev.off()
r <- RunUMAP(r, reduction = “pca”, dims = 1:10)
umap.emb <- Embeddings(r, reduction = “umap”) PlotDims(umap.emb, sample.groups = clusters, pt.size = 0.75, alpha.plot = 0.5, label.size = 6, do.label = T, show.legend = F, seed = plot.seed)
DimPlot(r, reduction = “umap”) DimPlot(r, reduction = “tsne”) DimPlot(r, reduction = “pca”)
library(FlowSOM) library(flowCore) library(Biobase) library(ggplot2) library(hexbin) library(viridis) library(ggExtra) library(RColorBrewer) library(MEM) library(tidyverse) library(Rtsne) library(uwot)
# Time ~1 min
set.seed(OVERALL_SEED)
# Run UMAP on all scaled surface markers
# the line below will run UMAP on the data set (to see help page for UMAP, type
# "?UMAP -- enter" in console)
# you can view UMAP progress by opening up the console below
myumap <-
umap(transformed.chosen.markers, # input scaled data
n_neighbors = 15, # number of nearest neighbors to look at,
# scales with data set
n_threads = 1, # this makes UMAP reproducible
verbose = TRUE)
umap.data = as.data.frame(myumap)
cat("\n\n...'run_UMAP' finished running")
Generate UMAP plots